In [1]:

    
import sys
sys.path.append('/Users/erickpeirson/tethne')



In [2]:

    
import matplotlib.pyplot as plt

1. Create a corpus from a JSTOR DfR dataset

1.1. Load bibliographic data



In [3]:

    
from tethne.readers import dfr



In [4]:

    
# Directories of the JSTOR DfR datasets that the cells below read from.
# NOTE(review): the second and third entries are the same directory
# (2013.5.3.k2HUvXh9) -- confirm whether a different third dataset was intended.
datapath = ['/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.cHrmED8A',
            '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.k2HUvXh9',
            '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.k2HUvXh9']



In [5]:

    
outpath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldaout'
temppath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldatemp'



In [6]:

    
# Call dfr.read() on every entry of datapath and flatten the results into one
# list; a directory listed more than once in datapath is read once per listing.
papers = [ p for path in datapath for p in dfr.read(path) ]



In [7]:

    
len(papers)









    Out[7]:





880

1.2. Load wordcounts



In [8]:

    
# Merge the result of dfr.ngrams(path, 'uni') for every entry of datapath into
# one dict. dict.update() is used, so when two datasets share a key the value
# from the later dataset replaces the earlier one.
wordcounts = {}
for path in datapath:
    w = dfr.ngrams(path, 'uni')  # 'uni' presumably selects unigrams -- TODO confirm
    wordcounts.update(w)

1.3. Load NLTK stoplist



In [9]:

    
from nltk.corpus import stopwords



In [10]:

    
stoplist = stopwords.words()

1.4. Create a Corpus



In [11]:

    
from tethne import Corpus



In [12]:

    
D = Corpus(papers, features={'wordcounts': wordcounts}, index_by='doi', exclude=stoplist)

1.5. Filter words in wordcount featureset



In [13]:

    
def filt(s, C, DC):
    """
    Decide whether a feature is kept when building the filtered featureset.

    Parameters
    ----------
    s : sized object
        The feature itself; only its length is inspected (presumably the
        word string -- confirm against ``Corpus.filter_features``).
    C : number
        Compared against 3 (presumably the overall count of the feature).
    DC : number
        Compared against 1 (presumably the number of documents containing
        the feature).

    Returns
    -------
    bool
        ``True`` only when ``C > 3``, ``DC > 1`` and ``len(s) > 3`` all hold;
        ``False`` otherwise.
    """
    # Guard clauses are checked in the same left-to-right order as a chained
    # ``and``, so ``len(s)`` is only evaluated once both counts pass.
    if not (C > 3):
        return False
    if not (DC > 1):
        return False
    return len(s) > 3



In [14]:

    
D.filter_features('wordcounts', 'wordcounts_filtered', filt)



In [15]:

    
len(D.features['wordcounts']['index']), len(D.features['wordcounts_filtered']['index'])









    Out[15]:





(122836, 27750)

1.6. Create a time-period index



In [16]:

    
D.slice('date', method='time_period', window_size=5)



In [17]:

    
D.plot_distribution('date')









    



[1921, 1926, 1931, 1936, 1941, 1946, 1951, 1956, 1961, 1966, 1971, 1976]



In [18]:

    
D.slice('jtitle')



In [21]:

    
D.plot_distribution('date', 'jtitle', aspect=0.1, interpolation='none')









    



[1921, 1926, 1931, 1936, 1941, 1946, 1951, 1956, 1961, 1966, 1971, 1976]
[0, 2, 4, 6, 8, 10]



In [23]:

    
# NOTE(review): the saved output for this cell is a NameError traceback whose
# source line lacks include_papers=False, so it appears to come from an earlier
# version of this call -- re-run on a fresh kernel to confirm the current result.
D.get_by([('date',1946), ('date',1951)], include_papers=False)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-23-22d13fb7070d> in <module>()
----> 1 D.get_by([('date',1946), ('date',1951)])

/Users/erickpeirson/tethne/tethne/classes/corpus.py in get_by(self, key_indices, include_papers)
    820             A list of paper indices, or :class:`.Paper` instances.
    821 
--> 822         """
    823 
    824         if len(self.axes) == 0:

NameError: global name 'papers' is not defined



In [19]:

    
from tethne.model.managers import DTMModelManager



In [20]:

    
dtm_path = '/Users/erickpeirson/tethne/tethne/model/bin/main'



In [21]:

    
dtm_outpath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/dtmout'
dtm_temppath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/dtmtemp'



In [22]:

    
DM = DTMModelManager(D, feature='wordcounts_filtered', outpath=dtm_outpath, 
                        temppath=dtm_temppath, dtm_path=dtm_path)



In [23]:

    
DM.prep()



In [24]:

    
DM.build()









    Out[24]:





<tethne.model.corpus.dtmmodel.DTMModel at 0x10db6d1d0>



In [25]:

    
import cPickle as pickle



In [26]:

    
# Persist the model manager inside the DTM output directory.
# The target is derived from dtm_outpath (set in an earlier cell) instead of
# repeating the absolute directory as a second literal, so the two cannot drift
# apart; with the current dtm_outpath this resolves to the same file as before.
import os

dtm_pickle_path = os.path.join(dtm_outpath, 'DTMModelManager.pickle')
with open(dtm_pickle_path, 'wb') as f:
    pickle.dump(DM, f)



In [28]:

    
DM.list_topic_diachronic(1)









    Out[28]:





{0: ['pollen',
  'species',
  'flowers',
  'embryo',
  'grains',
  'plants',
  'number',
  'perenne',
  'flower',
  'pollination'],
 1: ['pollen',
  'species',
  'flowers',
  'embryo',
  'grains',
  'plants',
  'number',
  'perenne',
  'flower',
  'pollination'],
 2: ['pollen',
  'species',
  'flowers',
  'embryo',
  'grains',
  'plants',
  'number',
  'perenne',
  'flower',
  'subsp'],
 3: ['pollen',
  'species',
  'flowers',
  'embryo',
  'grains',
  'plants',
  'number',
  'perenne',
  'flower',
  'subsp'],
 4: ['pollen',
  'species',
  'flowers',
  'grains',
  'plants',
  'number',
  'embryo',
  'perenne',
  'subsp',
  'flower'],
 5: ['pollen',
  'species',
  'flowers',
  'plants',
  'grains',
  'number',
  'perenne',
  'subsp',
  'embryo',
  'flower'],
 6: ['pollen',
  'species',
  'flowers',
  'perenne',
  'plants',
  'number',
  'subsp',
  'grains',
  'female',
  'flower'],
 7: ['pollen',
  'species',
  'perenne',
  'flowers',
  'subsp',
  'plants',
  'number',
  'female',
  'chromosome',
  'grains'],
 8: ['pollen',
  'species',
  'flowers',
  'plants',
  'female',
  'number',
  'chromosome',
  'male',
  'subsp',
  'perenne'],
 9: ['species',
  'pollen',
  'female',
  'plants',
  'number',
  'flowers',
  'male',
  'chromosome',
  'pollination',
  'table'],
 10: ['species',
  'pollen',
  'chromosome',
  'number',
  'plants',
  'female',
  'flowers',
  'male',
  'table',
  'pollination'],
 11: ['species',
  'pollen',
  'chromosome',
  'number',
  'plants',
  'female',
  'table',
  'flowers',
  'numbers',
  'male']}



In [30]:

    
import networkx as nx



In [43]:

    
g = nx.Graph(name='my graph')



In [47]:

    
# NOTE(review): the saved g.__dict__ output further down also contains an edge
# between nodes 0 and 3 that no visible cell adds -- presumably left over from
# an earlier execution; re-run on a fresh kernel to confirm.
g.add_edge(1,3, weight=0.5)



In [49]:

    
g.add_node(1, size=0.3)



In [50]:

    
g.__dict__









    Out[50]:





{'adj': {0: {3: {'weight': 0.5}},
  1: {3: {'weight': 0.5}},
  3: {0: {'weight': 0.5}, 1: {'weight': 0.5}}},
 'edge': {0: {3: {'weight': 0.5}},
  1: {3: {'weight': 0.5}},
  3: {0: {'weight': 0.5}, 1: {'weight': 0.5}}},
 'graph': {'name': 'my graph'},
 'node': {0: {}, 1: {'size': 0.3}, 3: {}}}



In [51]:

    
from scipy.sparse import coo_matrix



In [191]:

    
I = [0,1,2,3,3]
J = [1,1,3,0,1]
K = [1, 2, 3, 4, 5]



In [192]:

    
A = coo_matrix((K, (I, J)))



In [211]:

    
# Pair the row-index and column-index arrays returned by A.nonzero() into
# (row, col) tuples, one per nonzero entry.
zip(*A.nonzero())









    Out[211]:





[(0, 1), (1, 1), (2, 3), (3, 0), (3, 1)]



In [194]:

    
B = A.tocsr()



In [195]:

    
B[0,1]









    Out[195]:





1



In [196]:

    
C = A.tolil()



In [197]:

    
A.nonzero()[0]









    Out[197]:





array([0, 1, 2, 3, 3], dtype=int32)



In [198]:

    
C.nonzero()









    Out[198]:





(array([0, 1, 2, 3, 3], dtype=int32), array([1, 1, 3, 0, 1], dtype=int32))



In [203]:

    
# Union of the column indices of the nonzero entries in row 0 and the row
# indices of the nonzero entries in column 0, returned as a list.
list(set(B[0,:].nonzero()[1]) | set(B[:,0].nonzero()[0]))









    Out[203]:





[1, 3]



In [200]:

    
A.nonzero()









    Out[200]:





(array([0, 1, 2, 3, 3], dtype=int32), array([1, 1, 3, 0, 1], dtype=int32))



In [201]:

    
class SA(object):
    """
    Minimal demonstration of two-index subscripting (``SA()[i, j]``).

    ``__getitem__`` receives the subscript as a single object, unpacks it
    into two indices and prints them.
    """

    def __getitem__(self, indices):
        """
        Print the two indices of a subscript, separated by a single space.

        Parameters
        ----------
        indices : iterable of exactly two items
            The subscript, e.g. the tuple ``(0, 1)`` produced by ``obj[0, 1]``.

        Returns
        -------
        None

        Raises
        ------
        TypeError, ValueError
            Propagated from the unpacking when ``indices`` is not an
            iterable of exactly two items.
        """
        i, j = indices
        # Format explicitly instead of using the Python 2 ``print`` statement:
        # this prints the same "i j" text under Python 2 and Python 3.
        print('%s %s' % (i, j))



In [202]:

    
SA()[0,1]

0 1



In [214]:

    
g.edges(data=True)









    Out[214]:





[(0, 3, {'weight': 0.5}), (1, 3, {'weight': 0.5})]



In [216]:

    
A.data









    Out[216]:





array([1, 2, 3, 4, 5])



In [218]:

    
g.edge









    Out[218]:





{0: {3: {'weight': 0.5}},
 1: {3: {'weight': 0.5}},
 3: {0: {'weight': 0.5}, 1: {'weight': 0.5}}}



In [219]:

    
from tethne.persistence.hdf5.graphcollection import HDF5Graph









    



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-219-b621458adbac> in <module>()
----> 1 from tethne.persistence.hdf5.graphcollection import HDF5Graph

ImportError: No module named graphcollection



In [ ]: